In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px

from utils import *
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_columns', 100)
matplotlib.rcParams.update({"font.size": 16,'lines.linewidth': 2.5})
# matplotlib.rcdefaults()
figures saved to ./tmp/figures
In [2]:
DATA_DIR = '../data/'
dfj = get_df(DATA_DIR + 'pai_job_table.csv')
dft = get_df(DATA_DIR + 'pai_task_table.csv')
dfi = get_df(DATA_DIR + 'pai_instance_table.csv')
dfg = get_df(DATA_DIR + 'pai_group_tag_table.csv')

dfa = get_dfa(dft, dfj, dfi, dfg)
dft + dfj ...
dft + dfj + dfi ...
dft + dfj + dfi + dfg ...
In [3]:
data_df = dfa
data_df_1 = data_df[(data_df.plan_gpu.isnull() == False)]

Percentage of tasks for each task name, grouped by GPU¶

In [4]:
misc_task_useage = data_df_1[data_df_1.gpu_type == "MISC"].groupby('task_name').count()/len(data_df_1[data_df_1.gpu_type == "MISC"])
p100_task_useage = data_df_1[data_df_1.gpu_type == "P100"].groupby('task_name').count()/len(data_df_1[data_df_1.gpu_type == "P100"])
t4_task_useage = data_df_1[data_df_1.gpu_type == "T4"].groupby('task_name').count()/len(data_df_1[data_df_1.gpu_type == "T4"])
v100_task_useage = data_df_1[data_df_1.gpu_type == "V100"].groupby('task_name').count()/len(data_df_1[data_df_1.gpu_type == "V100"])
v100m32_task_useage = data_df_1[data_df_1.gpu_type == "V100M32"].groupby('task_name').count()/len(data_df_1[data_df_1.gpu_type == "V100M32"])

gpu_task_useage = pd.concat([misc_task_useage.job_name, 
                             p100_task_useage.job_name, 
                             t4_task_useage.job_name, 
                             v100_task_useage.job_name, 
                             v100m32_task_useage.job_name], axis=1)

gpu_task_useage.columns = ['MISC_task_useage', 'P100_task_useage', 'T4_task_useage', 'V100_task_useage', 'V100M32_task_useage']

gpu_task_useage = gpu_task_useage.fillna(0)
In [5]:
gpu_task_useage.reset_index(inplace=True)
In [6]:
gpu_task_useage_melt = pd.melt(gpu_task_useage, id_vars=["task_name"], value_name='percentage')
fig = px.bar(gpu_task_useage_melt, x=gpu_task_useage_melt.percentage, y = gpu_task_useage_melt.task_name, width=1000, height=700, color='variable', barmode='group')
fig.show()

Percentage of total tasks being shared by each task type¶

In [7]:
data_df_2 = data_df_1
# plan non shared:
data_df_2_shared = data_df_2[data_df_2.plan_gpu % 100 > 0]
# plan shared:
data_df_2_non_shared = data_df_2[data_df_2.plan_gpu % 100 == 0]
data_df_2_shared.shared = True

data_df_2_shared.assign(Name='shared')
data_df_2_shared["shared"] = True
data_df_2_non_shared.assign(Name='shared')
data_df_2_non_shared["shared"] = False
data_df_2 = pd.concat([data_df_2_shared, data_df_2_non_shared], ignore_index=True)
C:\Users\lhtMi\AppData\Local\Temp\ipykernel_24908\1696230573.py:9: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

C:\Users\lhtMi\AppData\Local\Temp\ipykernel_24908\1696230573.py:11: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [8]:
temp = data_df_2.groupby(['task_name', 'shared']).count().reset_index()

temp.job_name = temp.job_name/temp.job_name.sum()

fig = px.histogram(temp, x="task_name", y="job_name",
             color='shared', barmode='group',
             height=400)

fig.update_layout(
    title="Percentage of total tasks being shared by each task type",
    xaxis_title="Task names",
    yaxis_title="Percentage of total tasks",
    legend_title="Using Shared GPU"
)
fig.show()

Percentage of total tasks being shared by each GPU type¶

In [9]:
# "Percentage of GPU being shared"
temp = data_df_2.groupby(['gpu_type', 'shared']).count().reset_index()

temp.job_name = temp.job_name/temp.job_name.sum()

fig = px.histogram(temp, x="gpu_type", y="job_name",
             color='shared', barmode='group',
             height=400)

fig.update_layout(
    title="Percentage of total tasks being shared by each GPU type",
    xaxis_title="GPU type",
    yaxis_title="Percentage of total tasks",
    legend_title="Using Shared GPU"
)
fig.show()

Percentage of total tasks on each GPU¶

Note that this plot only works under Jupyter Notebook or Jupyter Lab

In [10]:
# "Percentage of GPU being shared"

def plot_percentage_shared_by_GPU(gpu_name = "MISC"):

    data_df_2.groupby(['gpu_type', "task_name", 'shared'])

    temp = data_df_2.groupby(['gpu_type', "task_name", 'shared']).count().reset_index()

    temp.job_name = temp.job_name/temp.job_name.sum()

    fig = px.histogram(temp[temp.gpu_type == gpu_name], x="shared", y="job_name",
                 color='task_name', 
                 height=400, width=400)

    fig.update_layout(
        title="Percentage of total tasks on " + gpu_name,
        xaxis_title="Using Shared GPU",
        yaxis_title="Percentage of total tasks",
        legend_title="Task name"
    )
    fig.show()
    
In [23]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
v = interact(plot_percentage_shared_by_GPU, gpu_name=['T4','MISC','P100','V100','V100M32'])
display(v)
interactive(children=(Dropdown(description='gpu_name', index=1, options=('T4', 'MISC', 'P100', 'V100', 'V100M3…
<function __main__.plot_percentage_shared_by_GPU(gpu_name='MISC')>

3d Scatter plot, grouped by task name¶

In [12]:
df_summarize_resource = data_df_2.groupby(['plan_gpu','plan_cpu', 'plan_mem', 'task_name']).count()
df_summarize_resource = df_summarize_resource.drop(columns=['inst_num', 'status', 'start_time', 'end_time', 'gpu_type',
       'inst_id', 'user', 'status_j', 'start_time_j', 'end_time_j', 'runtime',
       'status_i', 'start_time_i', 'end_time_i', 'runtime_i', 'duration_min',
       'wait_time', 'start_date', 'gpu_type_spec', 'group', 'workload',
       'shared'])
df_summarize_resource = df_summarize_resource.reset_index()


df_summarize_resource = df_summarize_resource.rename(columns={"job_name": "count"})
def set_marker_size(x):
    x = x["count"]
    if x < 100:
        return 1
    elif x < 500:
        return 2
    elif x < 1000:
        return 3
    elif x < 2000:
        return 4
    else:
        return 5
df_summarize_resource['marker_size'] = df_summarize_resource.apply(lambda x: set_marker_size(x), axis=1)

fig = px.scatter_3d(df_summarize_resource, x='plan_gpu', y='plan_cpu', z='plan_mem',
          color='task_name', hover_data=["count"], opacity=0.7, size = "marker_size", width=1200, height=800)
fig.show()

3d Scatter plot, grouped by gpu type¶

In [13]:
# by GPU type
df_summarize_resource_gpu = data_df_2.groupby(['plan_gpu','plan_cpu', 'plan_mem', 'gpu_type']).count()
df_summarize_resource_gpu = df_summarize_resource_gpu.drop(columns=['inst_num', 'status', 'start_time', 'end_time',
       'inst_id', 'user', 'status_j', 'start_time_j', 'end_time_j', 'runtime',
       'status_i', 'start_time_i', 'end_time_i', 'runtime_i', 'duration_min',
       'wait_time', 'start_date', 'gpu_type_spec', 'group', 'workload',
       'shared'])
df_summarize_resource_gpu = df_summarize_resource_gpu.reset_index()

df_summarize_resource_gpu = df_summarize_resource_gpu.rename(columns={"job_name": "count"})
df_summarize_resource_gpu['marker_size'] = df_summarize_resource_gpu.apply(lambda x: set_marker_size(x), axis=1)

fig = px.scatter_3d(df_summarize_resource_gpu, x='plan_gpu', y='plan_cpu', z='plan_mem',
          color='gpu_type', hover_data=["count"], opacity=0.7, size = "marker_size", width=1200, height=800)
fig.show()

Estimate wait time using Machine Learning methods¶

We preprocess the data by selecting all the planned resources and the hour task initiated. We observed some wait times being negative, which is not possible, so we removed those rows with negative wait times. Then we drop NA and split the data for training.

In [14]:
data_df.wait_time.describe()
Out[14]:
count    1.165653e+06
mean     3.562137e+02
std      5.199093e+03
min     -5.134800e+04
25%      3.000000e+00
50%      8.000000e+00
75%      1.300000e+01
max      5.998170e+05
Name: wait_time, dtype: float64
In [15]:
from sklearn.model_selection import train_test_split

# Preprocess data
df = data_df
df = df[df.wait_time >= 0]
# Only use completed tasks
df = df[df.status == "Terminated"]
df = add_hour_date(df)
# Select columns that related to wait time
df_train = df[["task_name", "plan_cpu", "plan_mem", "plan_gpu", "gpu_type", "hour", "wait_time"]]
df_train = df_train.dropna()
df_train = pd.get_dummies(data=df_train, drop_first=True)
In [16]:
# Out put wait time
Y = df_train['wait_time']

# Input
X = df_train.drop(['wait_time'], axis=1)

Linear regression¶

We would like first to try the linear regression model to estimate the wait time. Because linear regression is simple to understand, and based on our exploration, some variables seems to correlate with the score linearly, so we would like to try the linear regression model first.

In [17]:
# Train the model
from sklearn.linear_model import LinearRegression
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=101)
model = LinearRegression()
model.fit(X_train,y_train)
Out[17]:
LinearRegression()

The plot of regression line

In [18]:
from sklearn.metrics import mean_squared_error
predictions1 = model.predict(X_test)
g = sns.regplot(x=list(y_test),y=predictions1)
g.set(xlabel='True wait time')
g.set(ylabel='Predicted wait time')
Out[18]:
[Text(0, 0.5, 'Predicted wait time')]
In [19]:
# RMSE
mean_squared_error(y_test, predictions1, squared=False)
Out[19]:
4010.3750400876725

Random Forest regressor¶

The second model we want to try is RandomForest. RandomForest fits several classifying decision trees on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. We chose this algorithm because there could have some non-linearity between each variable and the wait time, and the decision tree could capture those non-linearities. Further, random forests could reduce variance by training on different samples of the data.

In [20]:
from sklearn.ensemble import RandomForestRegressor
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=101)
model = RandomForestRegressor()
model.fit(X_train,y_train)
Out[20]:
RandomForestRegressor()
In [21]:
from sklearn.metrics import mean_squared_error
predictions2 = model.predict(X_test)
g = sns.regplot(x=list(y_test),y=predictions2)
g.set(xlabel='True wait time')
g.set(ylabel='Predicted wait time')
Out[21]:
[Text(0, 0.5, 'Predicted wait time')]
In [22]:
# RMSE
mean_squared_error(y_test, predictions2, squared=False)
Out[22]:
4072.746099120997